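"""Tests for the robotparser module.

Each RobotTest() call below parses a small robots.txt document and checks
RobotFileParser.can_fetch() against URLs that must be allowed ('good') and
URLs that must be disallowed ('bad').
"""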

import unittest, StringIO, robotparser
from test import test_support

class RobotTestCase(unittest.TestCase):
    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

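    # self.url may be a plain path, checked with the default agent, or an
    # (agent, url) tuple that overrides the agent for this single check.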
    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.failUnless(self.parser.can_fetch(agent, url))
        else:
            self.failIf(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

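# Build a parser from robots_txt and register one RobotTestCase per URL:
# every URL in good_urls must be fetchable by `agent`, every URL in
# bad_urls must be disallowed.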
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

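# Tests 4-6 exercise %-escape handling: escapes should match
# case-insensitively ('%3c' == '%3C'), '%7e' should be equivalent to '~',
# and an encoded slash ('%2f') should stay distinct from a literal '/'.
# The XFAIL notes below mark cases the parser is known to get wrong.
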
# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL: per the RFC, '/a/b.html' should be allowed (only the
          # encoded '/a%2fb.html' is disallowed), but the parser fails it.
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: per the RFC, '/a%2fb.html' (encoded slash) should
                 # be allowed when only the literal '/a/b.html' is
                 # disallowed, but the parser fails it.
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

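# For reference, a minimal sketch of driving the module under test directly,
# outside the suite (the agent name and paths here are arbitrary):
#
#   rp = robotparser.RobotFileParser()
#   rp.parse(StringIO.StringIO("User-agent: *\nDisallow: /private/\n").readlines())
#   rp.can_fetch("anybot", "/private/page.html")  # -> False
#   rp.can_fetch("anybot", "/public/page.html")   # -> True
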
def test_main():
    test_support.run_suite(tests)

if __name__ == '__main__':
    # The regrtest flag is the lowercase test_support.verbose; the original
    # "Verbose" assignment bound an unused attribute and had no effect.
    test_support.verbose = 1
    test_support.run_suite(tests)